library(tidyverse)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(knitr)
library(lme4)## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(MuMIn)
library(lmerTest)##
## Attaching package: 'lmerTest'
##
## The following object is masked from 'package:lme4':
##
## lmer
##
## The following object is masked from 'package:stats':
##
## step
source("utils.R")
source("preprocess.R")## Joining, by = c("item_id", "item", "condition", "first_mention",
## "recent_mention", "knowledge_cue", "start", "end")
MTurk slightly oversampled to 1161 participants. Only 13 ppts indicated they are not native English speakers.
summarise.exclusions(ppts.all)| Reason | Removed | (%) |
|---|---|---|
| ex.native_eng | 13 | 1.1 |
| ex.runtime | 0 | 0.0 |
| —— | NA | NA |
| Total Removed | 13 | 1.1 |
| Retained | 1143 | 98.9 |
55% of ppts (630/1143) passed both attention checks.
attention %>%
  group_by(participant_id) %>%
  summarize(accuracy = mean(accuracy), .groups = "drop") %>%
  # One row per participant at this point, so proportions must be taken
  # over participants. The original divided by nrow(attention), which
  # counts trials (two per participant), so the reported props summed to
  # ~0.5 instead of 1.
  count(accuracy, name = "n") %>%
  mutate(prop = round(n / sum(n), 2))| accuracy | n | prop |
|---|---|---|
| 0.0 | 297 | 0.13 |
| 0.5 | 216 | 0.09 |
| 1.0 | 630 | 0.28 |
attention %>%
  # Keep only incorrect trials; NA correctness is dropped by filter()
  # exactly as with the original `is_correct == F` comparison.
  filter(!is_correct) %>%
  # Share of incorrect answers at start/end vs. elsewhere. Using
  # count() + mutate() avoids re-filtering `attention` inside the
  # summarize() call and avoids the T/F shorthand.
  count(is_start_or_end, name = "n") %>%
  mutate(prop = round(n / sum(n), 2))| is_start_or_end | n | prop |
|---|---|---|
| FALSE | 606 | 0.75 |
| TRUE | 204 | 0.25 |
Item accuracy looks normal.
attention %>%
mutate(
item_question_id = paste0(item, "_", question_id)
) %>%
group_by(item_question_id) %>%
summarize(accuracy = mean(accuracy), .groups="drop") %>%
ggplot(aes(x = accuracy, y=reorder(item_question_id, -accuracy))) +
stat_summary(fun="mean", geom="bar")The distribution by item looks fairly normal but does lead to some extreme cases (e.g. no cases for top left, TB, 12).
critical %>%
filter(excluded.attention == F) %>%
ggplot(aes(x = factor(item), fill=condition)) +
geom_bar(stat="count", position = "dodge") +
facet_grid(cols=vars(first_mention), rows=vars(recent_mention),
labeller = "label_both")critical %>%
filter(excluded.attention == F) %>%
ggplot(aes(x = knowledge_cue, fill=condition)) +
geom_bar(stat="count", position = "dodge") +
facet_grid(cols=vars(first_mention), rows=vars(recent_mention),
labeller = "label_both")Accuracy is 80% for ppts who passed the attention checks (and 25% for those who didn’t).
critical %>%
ggplot(aes(x = excluded.attention, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar") +
scale_fill_manual(values=c("#009933", "#FF0000"))critical %>%
group_by(excluded.attention) %>%
summarize(accuracy=mean(accuracy), n=n(), .groups="drop")| excluded.attention | accuracy | n |
|---|---|---|
| FALSE | 0.8047619 | 630 |
| TRUE | 0.2553606 | 513 |
critical %>%
filter(excluded.attention == F) %>%
ggplot(aes(x = condition, y = accuracy, fill=condition)) +
stat_summary(fun="mean", geom="bar")critical %>%
filter(excluded.attention == F,
is_start | is_end) %>%
group_by(condition) %>%
summarize(start=mean(is_start), n=n(), .groups="drop")| condition | start | n |
|---|---|---|
| False Belief | 0.8964401 | 309 |
| True Belief | 0.2434211 | 304 |
critical <- critical %>%
  mutate(
    # Numeric 0/1 coding of correctness; produces the same values as the
    # original ifelse(is_correct, 1, 0), including NA propagation.
    accuracy = as.numeric(is_correct)
  )None of the items look particularly easy/hard.
critical %>%
ggplot(aes(x = reorder(item, -accuracy), y = accuracy, color=excluded.attention)) +
stat_summary(fun="mean", geom="point") +
facet_grid(cols=vars(excluded.attention), labeller=label_both) +
scale_color_manual(values=c("#009933", "#FF0000"))The incorrect answers from retained ppts mostly look like genuine mistakes.
critical %>%
filter(is_correct == F,
excluded.attention == FALSE) %>%
select(participant_id, item_id, correct_answer, response, is_correct) %>%
arrange(item_id)| participant_id | item_id | correct_answer | response | is_correct |
|---|---|---|---|---|
| 1758 | 1_fb_1_e_s_ex | box | room | FALSE |
| 1893 | 1_fb_1_e_s_im | box | box but finds it is missing | FALSE |
| 415 | 1_tb_1_e_e_ex | basket | room | FALSE |
| 1411 | 1_tb_1_e_e_ex | basket | box | FALSE |
| 669 | 1_tb_1_e_s_ex | basket | box | FALSE |
| 1134 | 1_tb_1_e_s_ex | basket | box | FALSE |
| 1853 | 1_tb_1_s_s_im | basket | box | FALSE |
| 150 | 10_fb_1_e_e_im | toolbox | van | FALSE |
| 1969 | 10_fb_1_e_e_im | toolbox | van | FALSE |
| 314 | 10_fb_1_s_e_im | toolbox | van | FALSE |
| 645 | 10_fb_1_s_e_im | toolbox | van | FALSE |
| 1153 | 10_fb_1_s_e_im | toolbox | van | FALSE |
| 1409 | 10_fb_1_s_e_im | toolbox | van | FALSE |
| 1672 | 10_fb_1_s_s_im | toolbox | van | FALSE |
| 885 | 10_tb_1_e_e_ex | van | toolbox | FALSE |
| 640 | 10_tb_1_e_s_ex | van | toolbox | FALSE |
| 1425 | 10_tb_1_e_s_ex | van | toolbox | FALSE |
| 1580 | 10_tb_1_e_s_ex | van | toolbox | FALSE |
| 1844 | 10_tb_1_e_s_im | van | toolbox | FALSE |
| 914 | 10_tb_1_s_e_im | van | toolbox | FALSE |
| 1972 | 10_tb_1_s_e_im | van | toolbox | FALSE |
| 1641 | 11_fb_1_e_s_im | suitcase | backpack | FALSE |
| 1817 | 11_fb_1_s_e_im | suitcase | backpack | FALSE |
| 902 | 11_tb_1_s_e_ex | backpack | suitcase | FALSE |
| 1967 | 11_tb_1_s_e_im | backpack | suitcase | FALSE |
| 612 | 11_tb_1_s_s_ex | backpack | suitcase | FALSE |
| 1302 | 11_tb_1_s_s_im | backpack | suitcase | FALSE |
| 1423 | 11_tb_1_s_s_im | backpack | suitcase | FALSE |
| 1009 | 12_fb_1_e_s_im | stable | hut | FALSE |
| 1219 | 12_fb_1_s_s_im | stable | hut | FALSE |
| 1269 | 12_tb_1_e_s_im | hut | stable | FALSE |
| 209 | 12_tb_1_s_e_ex | hut | stable | FALSE |
| 520 | 12_tb_1_s_e_ex | hut | stable | FALSE |
| 1491 | 12_tb_1_s_e_ex | hut | stable | FALSE |
| 1837 | 12_tb_1_s_e_ex | hut | stable | FALSE |
| 1777 | 12_tb_1_s_s_ex | hut | stable | FALSE |
| 1835 | 2_fb_1_e_e_im | cupboard | sandwich | FALSE |
| 1839 | 2_fb_1_e_s_im | cupboard | fridge | FALSE |
| 339 | 2_tb_1_e_s_im | fridge | cupboard | FALSE |
| 112 | 2_tb_1_s_e_ex | fridge | cupboard | FALSE |
| 1528 | 2_tb_1_s_s_ex | fridge | cupboard | FALSE |
| 1963 | 2_tb_1_s_s_ex | fridge | cupboard | FALSE |
| 861 | 2_tb_1_s_s_im | fridge | cupboard | FALSE |
| 926 | 2_tb_1_s_s_im | fridge | cupboard | FALSE |
| 1804 | 2_tb_1_s_s_im | fridge | cupboard | FALSE |
| 159 | 3_fb_1_e_e_im | sink | basket | FALSE |
| 704 | 3_fb_1_s_s_ex | sink | stain | FALSE |
| 1416 | 3_tb_1_e_e_ex | basket | sink | FALSE |
| 247 | 3_tb_1_e_e_im | basket | sink | FALSE |
| 225 | 3_tb_1_e_s_ex | basket | sink | FALSE |
| 372 | 3_tb_1_s_s_ex | basket | sink | FALSE |
| 1298 | 3_tb_1_s_s_ex | basket | stain | FALSE |
| 1505 | 3_tb_1_s_s_im | basket | sink | FALSE |
| 1101 | 4_fb_1_e_e_im | shed | garage | FALSE |
| 518 | 4_fb_1_e_s_im | shed | garage | FALSE |
| 1947 | 4_fb_1_s_e_im | shed | garage | FALSE |
| 342 | 4_tb_1_e_e_ex | garage | shed | FALSE |
| 174 | 4_tb_1_e_e_im | garage | shed | FALSE |
| 408 | 4_tb_1_e_s_ex | garage | shed | FALSE |
| 349 | 4_tb_1_e_s_ex | garage | shed | FALSE |
| 1677 | 4_tb_1_e_s_ex | garage | shed | FALSE |
| 1417 | 4_tb_1_s_s_ex | garage | shed | FALSE |
| 2021 | 4_tb_1_s_s_ex | garage | shed | FALSE |
| 591 | 4_tb_1_s_s_im | garage | yard | FALSE |
| 2019 | 4_tb_1_s_s_im | garage | shed | FALSE |
| 371 | 5_fb_1_e_e_im | hall | study | FALSE |
| 366 | 5_fb_1_s_e_im | hall | bathroom | FALSE |
| 1464 | 5_fb_1_s_e_im | hall | study | FALSE |
| 121 | 5_tb_1_e_e_im | study | hall | FALSE |
| 1590 | 5_tb_1_e_s_ex | study | hall | FALSE |
| 1819 | 5_tb_1_e_s_ex | study | hall | FALSE |
| 309 | 5_tb_1_s_e_im | study | hall | FALSE |
| 1892 | 5_tb_1_s_s_ex | study | hall | FALSE |
| 1720 | 5_tb_1_s_s_im | study | hall | FALSE |
| 957 | 6_fb_1_s_e_ex | drawer | cabinet | FALSE |
| 1264 | 6_fb_1_s_e_ex | drawer | cabinet | FALSE |
| 258 | 6_fb_1_s_s_ex | drawer | ross | FALSE |
| 1930 | 6_fb_1_s_s_im | drawer | ross | FALSE |
| 839 | 6_tb_1_e_e_ex | cabinet | drawer | FALSE |
| 555 | 6_tb_1_e_e_im | cabinet | ross wanders | FALSE |
| 1684 | 6_tb_1_e_s_im | cabinet | drawer | FALSE |
| 178 | 6_tb_1_s_e_ex | cabinet | drawer | FALSE |
| 1077 | 6_tb_1_s_e_ex | cabinet | drawer | FALSE |
| 717 | 6_tb_1_s_e_im | cabinet | drawer | FALSE |
| 1018 | 6_tb_1_s_s_ex | cabinet | drawer | FALSE |
| 1376 | 7_fb_1_e_s_im | garage | fridge | FALSE |
| 930 | 7_fb_1_s_e_ex | garage | fridge | FALSE |
| 1667 | 7_fb_1_s_e_im | garage | fridge | FALSE |
| 183 | 7_fb_1_s_s_im | garage | fridge | FALSE |
| 1371 | 7_fb_1_s_s_im | garage | fridge | FALSE |
| 1063 | 7_tb_1_e_e_ex | fridge | garage | FALSE |
| 1253 | 7_tb_1_e_s_ex | fridge | garage | FALSE |
| 1809 | 7_tb_1_s_e_ex | fridge | garage | FALSE |
| 1928 | 7_tb_1_s_s_ex | fridge | garaage | FALSE |
| 895 | 8_fb_1_s_e_im | hall | bedroom | FALSE |
| 399 | 8_fb_1_s_s_ex | hall | bedroom | FALSE |
| 660 | 8_tb_1_e_e_ex | bedroom | good | FALSE |
| 1739 | 8_tb_1_e_e_ex | bedroom | hall | FALSE |
| 730 | 8_tb_1_e_e_im | bedroom | hall | FALSE |
| 1000 | 8_tb_1_e_e_im | bedroom | hall | FALSE |
| 941 | 8_tb_1_e_s_ex | bedroom | hall | FALSE |
| 1035 | 8_tb_1_e_s_ex | bedroom | hall | FALSE |
| 147 | 8_tb_1_s_e_ex | bedroom | hall | FALSE |
| 123 | 8_tb_1_s_e_im | bedroom | hall | FALSE |
| 574 | 8_tb_1_s_e_im | bedroom | hall | FALSE |
| 920 | 8_tb_1_s_e_im | bedroom | garden | FALSE |
| 1568 | 8_tb_1_s_s_ex | bedroom | hall | FALSE |
| 1833 | 8_tb_1_s_s_ex | bedroom | hall | FALSE |
| 1933 | 8_tb_1_s_s_ex | bedroom | hall | FALSE |
| 1806 | 9_fb_1_e_e_ex | cupboard | drawer | FALSE |
| 890 | 9_fb_1_e_e_im | cupboard | drawer | FALSE |
| 1828 | 9_fb_1_e_s_ex | cupboard | drawer | FALSE |
| 1873 | 9_fb_1_e_s_ex | cupboard | cabinet | FALSE |
| 1082 | 9_fb_1_s_e_im | cupboard | drawer | FALSE |
| 1245 | 9_fb_1_s_e_im | cupboard | drawer | FALSE |
| 566 | 9_tb_1_e_e_ex | drawer | cupboard | FALSE |
| 1715 | 9_tb_1_e_s_ex | drawer | cupboard | FALSE |
| 1124 | 9_tb_1_e_s_im | drawer | cupboard | FALSE |
| 844 | 9_tb_1_s_e_ex | drawer | cupboard | FALSE |
| 1235 | 9_tb_1_s_e_ex | drawer | cubpboard | FALSE |
| 846 | 9_tb_1_s_e_im | drawer | cupboard | FALSE |
| 1808 | 9_tb_1_s_s_ex | drawer | cupboard | FALSE |
| 230 | 9_tb_1_s_s_im | drawer | kitchen | FALSE |
First mention shows a noticeable effect. Effects of other vars look small.
critical %>%
ggplot(aes(x = condition, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))critical %>%
ggplot(aes(x = knowledge_cue, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))critical %>%
ggplot(aes(x = first_mention, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))critical %>%
mutate(start = ifelse(is_start, 1, 0)) %>%
ggplot(aes(x = first_mention, y = start, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))critical %>%
ggplot(aes(x = recent_mention, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))critical %>%
mutate(start = ifelse(is_start, 1, 0)) %>%
ggplot(aes(x = recent_mention, y = start, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))attention %>%
group_by(participant_id) %>%
summarize(
passage_reading_time = mean(passage_reading_time),
accuracy = mean(accuracy),
.groups="drop"
) %>%
ggplot(aes(x = passage_reading_time, y = accuracy)) +
# geom_point() +
stat_summary_bin(fun.data = mean_cl_boot, geom="pointrange", binwidth = 0.1) +
scale_x_log10() +
geom_smooth(method="lm", formula="y~x") +
labs(y = "attention_accuracy")## Warning: Removed 6 rows containing missing values (geom_segment).
attention %>%
ggplot(aes(x = reaction_time, y = accuracy)) +
# geom_point() +
stat_summary_bin(fun.data = mean_cl_boot, geom="pointrange", binwidth = 0.1) +
scale_x_log10() +
geom_smooth(method="lm", formula="y~x") +
labs(y = "Attention accuracy")## Warning: Removed 5 rows containing missing values (geom_segment).
critical %>%
group_by(participant_id, excluded.attention) %>%
summarize(
passage_reading_time = mean(passage_reading_time),
accuracy = mean(accuracy),
.groups="drop"
) %>%
ggplot(aes(x = passage_reading_time, y = accuracy, color=excluded.attention)) +
# geom_point() +
stat_summary_bin(fun.data = mean_cl_boot, geom="pointrange", binwidth = 0.2) +
scale_x_log10() +
geom_smooth(method="lm", formula="y~x", se=F) +
labs(y = "critical accuracy") +
scale_color_manual(values=c("#009933", "#FF0000"))## Warning: Removed 5 rows containing missing values (geom_segment).
critical %>%
ggplot(aes(x = reaction_time, y = accuracy, color=excluded.attention)) +
# geom_point() +
stat_summary_bin(fun.data = mean_cl_boot, geom="pointrange", binwidth = 0.2) +
scale_x_log10() +
geom_smooth(method="lm", formula="y~x", se=F) +
labs(y = "critical accuracy") +
scale_color_manual(values=c("#009933", "#FF0000"))## Warning: Removed 2 rows containing missing values (geom_segment).
Overall GPT-3 accuracy was 0.74.
df_fb_gpt3_dv %>%
ggplot(aes(x = condition, y = mdl.accuracy, fill=condition)) +
stat_summary(fun="mean", geom="bar")df_fb_gpt3_dv %>%
group_by(condition) %>%
summarize(accuracy=mean(mdl.accuracy), n=n(), .groups="drop")| condition | accuracy | n |
|---|---|---|
| False Belief | 0.7812500 | 96 |
| True Belief | 0.7083333 | 96 |
First, we ask whether condition predicts response, above
and beyond the other covariates excluding log_odds
from GPT-3.
Descriptively, we can ask whether a higher proportion of people respond with the START location in the FB or TB condition.
df_merged %>%
group_by(condition, knowledge_cue) %>%
summarise(prop_start = mean(is_start),
count = n(),
.groups="drop")| condition | knowledge_cue | prop_start | count |
|---|---|---|---|
| False Belief | Explicit | 0.9625000 | 160 |
| False Belief | Implicit | 0.8255034 | 149 |
| True Belief | Explicit | 0.2974684 | 158 |
| True Belief | Implicit | 0.1849315 | 146 |
df_merged %>%
ggplot(aes(x = condition,
y = is_start_numeric,
color = condition)) +
# geom_jitter(alpha = .1) +
stat_summary (fun = function(x){mean(x)},
fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
geom= 'pointrange',
position=position_dodge(width=0.95)) +
labs(x = "Condition",
y = "P(START)") +
scale_color_viridis_d() +
theme_bw() +
facet_wrap(~knowledge_cue,
nrow = 2)model_all_but_lo = glmer(
# Logistic mixed model of P(choosing the START location) with every
# covariate except the GPT-3 log-odds; serves as the reference model for
# the condition likelihood-ratio test (anova) below.
is_start_numeric ~ condition + knowledge_cue+
recent_mention +
first_mention +
(1 + condition | item),
data = df_merged,
control=glmerControl(optimizer="bobyqa"),
family = binomial())
# Same model with condition also removed from the fixed effects. The
# by-item random slope for condition is kept so the two models differ
# only in their fixed effects.
# NOTE(review): this fit is singular — consistent with the -1.00
# intercept/slope correlations reported in the model summaries below.
model_all_but_lo_and_condition = glmer(
is_start_numeric ~ knowledge_cue+
recent_mention +
first_mention +
(1 + condition | item),
data = df_merged,
control=glmerControl(optimizer="bobyqa"),
family = binomial())## boundary (singular) fit: see help('isSingular')
anova(model_all_but_lo, model_all_but_lo_and_condition)| npar | AIC | BIC | logLik | deviance | Chisq | Df | Pr(>Chisq) | |
|---|---|---|---|---|---|---|---|---|
| model_all_but_lo_and_condition | 7 | 567.8665 | 598.7950 | -276.9332 | 553.8665 | NA | NA | NA |
| model_all_but_lo | 8 | 538.1358 | 573.4827 | -261.0679 | 522.1358 | 31.7307 | 1 | 0 |
There is a significant effect of condition when accounting for log-odds.
# Full fixed-effects model: condition, knowledge cue, GPT-3 log-odds,
# and the two mention-order covariates, with by-item random intercepts
# and random condition slopes.
model_all_fe <- glmer(
  is_start_numeric ~ condition + knowledge_cue + log_odds +
    recent_mention + first_mention +
    (1 + condition | item),
  data = df_merged,
  family = binomial(),
  control = glmerControl(optimizer = "bobyqa")
)
# Nested comparison model: identical to model_all_fe but with condition
# dropped from the fixed effects (random structure unchanged).
model_no_condition <- glmer(
  is_start_numeric ~ knowledge_cue + log_odds +
    recent_mention + first_mention +
    (1 + condition | item),
  data = df_merged,
  family = binomial(),
  control = glmerControl(optimizer = "bobyqa")
)
anova(model_all_fe, model_no_condition)| npar | AIC | BIC | logLik | deviance | Chisq | Df | Pr(>Chisq) | |
|---|---|---|---|---|---|---|---|---|
| model_no_condition | 8 | 567.8009 | 603.1478 | -275.9005 | 551.8009 | NA | NA | NA |
| model_all_fe | 9 | 539.3875 | 579.1528 | -260.6937 | 521.3875 | 30.41345 | 1 | 0 |
The full model shows significant effects of condition (p &lt; .001) and knowledge cue (p = .017); log-odds and the mention covariates are not significant.
summary(model_all_fe)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## is_start_numeric ~ condition + knowledge_cue + log_odds + recent_mention +
## first_mention + (1 + condition | item)
## Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 539.4 579.2 -260.7 521.4 604
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2897 -0.4869 0.2331 0.3582 2.7109
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## item (Intercept) 0.1821 0.4268
## conditionTrue Belief 0.2735 0.5230 -1.00
## Number of obs: 613, groups: item, 12
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.45122 0.37135 6.601 4.09e-11 ***
## conditionTrue Belief -3.37187 0.36279 -9.294 < 2e-16 ***
## knowledge_cueImplicit -0.75039 0.31428 -2.388 0.017 *
## log_odds 0.04088 0.04694 0.871 0.384
## recent_mentionStart 0.31638 0.22503 1.406 0.160
## first_mentionStart -0.02224 0.23284 -0.096 0.924
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndtTB knwl_I lg_dds rcnt_S
## condtnTrBlf -0.790
## knwldg_cImp -0.635 0.507
## log_odds -0.502 0.547 0.669
## rcnt_mntnSt -0.255 -0.106 -0.001 -0.023
## frst_mntnSt -0.139 -0.172 -0.206 -0.267 0.035
In the no_condition model the LO effect grows but remains non-significant (p = .14); the knowledge-cue effect approaches significance (p = .06).
summary(model_no_condition)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: is_start_numeric ~ knowledge_cue + log_odds + recent_mention +
## first_mention + (1 + condition | item)
## Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 567.8 603.1 -275.9 551.8 605
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3060 -0.5273 0.1979 0.3689 2.8797
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## item (Intercept) 6.809 2.609
## conditionTrue Belief 12.817 3.580 -1.00
## Number of obs: 613, groups: item, 12
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.03996 0.45424 0.088 0.9299
## knowledge_cueImplicit -0.59347 0.31927 -1.859 0.0631 .
## log_odds 0.07446 0.05087 1.464 0.1433
## recent_mentionStart 0.28175 0.22012 1.280 0.2006
## first_mentionStart -0.05596 0.23089 -0.242 0.8085
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) knwl_I lg_dds rcnt_S
## knwldg_cImp -0.170
## log_odds 0.046 0.702
## rcnt_mntnSt -0.279 -0.012 -0.028
## frst_mntnSt -0.304 -0.227 -0.287 0.044
We can visualize this in a couple of ways. First, we can look at the
residuals of a model without condition, and ask whether they’re
correlated with condition.
# Attach residuals from the no-condition model for the plots below.
df_merged$resid <- residuals(model_no_condition)
df_merged %>%
ggplot(aes(x = condition,
y = resid,
color = condition)) +
geom_jitter(alpha = .3) +
stat_summary (fun = function(x){mean(x)},
fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
geom= 'pointrange',
position=position_dodge(width=0.95)) +
labs(x = "Condition",
y = "Residuals") +
geom_hline(yintercept = 0, linetype = "dotted") +
scale_color_viridis_d() +
theme_bw()Residuals are bimodal for all items in TB, and almost all items in FB.
# Recompute the residual column (identical to the earlier assignment;
# repeated so this chunk runs independently).
df_merged$resid <- residuals(model_no_condition)
df_merged %>%
ggplot(aes(x = condition,
y = resid,
color = condition)) +
geom_jitter(alpha = .3) +
stat_summary (fun = function(x){mean(x)},
fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
geom= 'pointrange',
position=position_dodge(width=0.95)) +
labs(x = "Condition",
y = "Residuals") +
geom_hline(yintercept = 0, linetype = "dotted") +
scale_color_viridis_d() +
theme_bw() +
# facet_grid(rows=vars(knowledge_cue), cols=vars(first_mention)) +
facet_wrap(facets=vars(item))Residuals are also bimodal in all intersections of first mention, recent mention, and knowledge cue, although the pattern seems less bimodal within false belief for kc:implicit, first_mention:end, and recent_mention:start.
# Recompute the residual column (identical to the earlier assignment;
# repeated so this chunk runs independently).
df_merged$resid <- residuals(model_no_condition)
df_merged %>%
ggplot(aes(x = condition,
y = resid,
color = knowledge_cue)) +
geom_jitter(alpha = .5) +
stat_summary (fun = function(x){mean(x)},
fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
geom= 'pointrange',
position=position_dodge(width=0.95)) +
labs(x = "Condition",
y = "Residuals") +
geom_hline(yintercept = 0, linetype = "dotted") +
# scale_color_viridis_d() +
theme_bw() +
facet_grid(rows=vars(recent_mention), cols=vars(first_mention), labeller=label_both) # facet_wrap(facets=vars(item))Another approach is to bin log-odds, and look at whether the probability of choosing the START location changes as a function of both binned log-odds and condition.
# P(START) as a function of GPT-3 log-odds (binned into deciles) and
# condition.
df_merged %>%
  mutate(binned_lo = ntile(log_odds, n = 10)) %>%
  ggplot(aes(x = binned_lo,
             y = is_start_numeric,
             color = condition)) +
  stat_summary(fun = function(x){mean(x)},
               fun.min = function(x){mean(x) - 2*sd(x)/sqrt(length(x))},
               fun.max = function(x){mean(x) + 2*sd(x)/sqrt(length(x))},
               geom = 'pointrange',
               position = position_dodge(width=0.95)) +
  geom_smooth() +
  labs(x = "Binned Log-odds",
       # The y variable here is is_start_numeric, not model residuals —
       # the original "Residuals" label was copied from the plots above.
       y = "P(START)",
       color = "Condition") +
  scale_color_viridis_d() +
  theme_bw() ## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Refit of the no-condition model with by-item random intercepts only
# (the random condition slope is dropped). Fit with the default
# optimizer; the bobyqa control was commented out in the original.
model_no_condition <- glmer(
  is_start_numeric ~ knowledge_cue + log_odds +
    recent_mention + first_mention +
    (1 | item),
  data = df_merged,
  family = binomial()
)
# Baseline with neither log-odds nor condition in the fixed effects;
# by-item random intercepts only, to match the refit model above.
model_all_but_lo_and_condition <- glmer(
  is_start_numeric ~ knowledge_cue + recent_mention + first_mention +
    (1 | item),
  data = df_merged,
  family = binomial(),
  control = glmerControl(optimizer = "bobyqa")
)
anova(model_no_condition, model_all_but_lo_and_condition)| npar | AIC | BIC | logLik | deviance | Chisq | Df | Pr(>Chisq) | |
|---|---|---|---|---|---|---|---|---|
| model_all_but_lo_and_condition | 5 | 836.6595 | 858.7513 | -413.3298 | 826.6595 | NA | NA | NA |
| model_no_condition | 6 | 676.0707 | 702.5809 | -332.0354 | 664.0707 | 162.5888 | 1 | 0 |
# Adds the condition x knowledge-cue interaction to the full model.
# NOTE(review): fit without the bobyqa control (left commented out),
# and the optimizer reports a convergence failure — see the warning
# below and the matching note in the summary() output; interpret the
# interaction estimate with caution.
model_all_fe_ixn = glmer(data = df_merged,
is_start_numeric ~ condition * knowledge_cue + log_odds +
recent_mention +
first_mention +
(1 + condition| item),
# control=glmerControl(optimizer="bobyqa"),
family = binomial())## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, :
## Model failed to converge with max|grad| = 0.00874747 (tol = 0.002, component 1)
# Refit of the additive full model without any optimizer control, so it
# is directly comparable to model_all_fe_ixn above (also fit with the
# default optimizer).
model_all_fe <- glmer(
  is_start_numeric ~ condition + knowledge_cue + log_odds +
    recent_mention + first_mention +
    (1 + condition | item),
  data = df_merged,
  family = binomial()
)
anova(model_all_fe_ixn, model_all_fe)| npar | AIC | BIC | logLik | deviance | Chisq | Df | Pr(>Chisq) | |
|---|---|---|---|---|---|---|---|---|
| model_all_fe | 9 | 539.3875 | 579.1528 | -260.6937 | 521.3875 | NA | NA | NA |
| model_all_fe_ixn | 10 | 535.8245 | 580.0082 | -257.9123 | 515.8245 | 5.562938 | 1 | 0.0183446 |
summary(model_all_fe_ixn)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## is_start_numeric ~ condition * knowledge_cue + log_odds + recent_mention +
## first_mention + (1 + condition | item)
## Data: df_merged
##
## AIC BIC logLik deviance df.resid
## 535.8 580.0 -257.9 515.8 603
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.7277 -0.5524 0.1716 0.3926 2.5634
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## item (Intercept) 0.1820 0.4266
## conditionTrue Belief 0.2873 0.5360 -1.00
## Number of obs: 613, groups: item, 12
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.98773 0.48509 6.159 7.32e-10
## conditionTrue Belief -4.05664 0.50775 -7.990 1.35e-15
## knowledge_cueImplicit -1.52778 0.49231 -3.103 0.00191
## log_odds 0.06321 0.04650 1.359 0.17408
## recent_mentionStart 0.33454 0.22508 1.486 0.13720
## first_mentionStart -0.05278 0.23143 -0.228 0.81959
## conditionTrue Belief:knowledge_cueImplicit 1.25756 0.55584 2.262 0.02367
##
## (Intercept) ***
## conditionTrue Belief ***
## knowledge_cueImplicit **
## log_odds
## recent_mentionStart
## first_mentionStart
## conditionTrue Belief:knowledge_cueImplicit *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndtTB knwl_I lg_dds rcnt_S frst_S
## condtnTrBlf -0.875
## knwldg_cImp -0.806 0.777
## log_odds -0.273 0.255 0.287
## rcnt_mntnSt -0.175 -0.100 -0.026 -0.018
## frst_mntnSt -0.135 -0.086 -0.093 -0.266 0.028
## cndtnTBl:_I 0.594 -0.686 -0.759 0.182 0.036 -0.051
## optimizer (Nelder_Mead) convergence code: 0 (OK)
## Model failed to converge with max|grad| = 0.00874747 (tol = 0.002, component 1)
ppts.all %>%
group_by(dyslexia, adhd, asd) %>%
summarize(n = n(),
prop = round(n / nrow(ppts.all), 2),
.groups="drop")| dyslexia | adhd | asd | n | prop |
|---|---|---|---|---|
| False | False | False | 822 | 0.71 |
| False | False | True | 39 | 0.03 |
| False | True | False | 149 | 0.13 |
| False | True | True | 12 | 0.01 |
| True | False | False | 86 | 0.07 |
| True | True | False | 16 | 0.01 |
| True | True | True | 32 | 0.03 |
ppts.all %>%
group_by(dyslexia, adhd, asd) %>%
summarize(n = n(),
prop = round(n / nrow(ppts.all), 2),
.groups="drop") %>%
ggplot(aes(x = dyslexia, y = n, fill=adhd)) +
geom_bar(stat="identity", position="dodge") +
facet_grid(cols=vars(asd), labeller="label_both") +
theme_minimal()df_merged <- df_merged %>%
merge(ppts.all, by.y="id", by.x="participant_id", all.y = F)
df_merged %>%
group_by(dyslexia, adhd, asd) %>%
summarize(n = n(),
correct = sum(accuracy),
accuracy = mean(accuracy),
.groups="drop")| dyslexia | adhd | asd | n | correct | accuracy |
|---|---|---|---|---|---|
| False | False | False | 530 | 452 | 0.8528302 |
| False | False | True | 10 | 6 | 0.6000000 |
| False | True | False | 49 | 31 | 0.6326531 |
| False | True | True | 4 | 3 | 0.7500000 |
| True | False | False | 16 | 11 | 0.6875000 |
| True | True | False | 1 | 1 | 1.0000000 |
| True | True | True | 3 | 3 | 1.0000000 |
Dyslexic participants perform worse.
# Attach participant demographics to the critical trials; merge() joins
# on the columns the two frames share (participant_id at minimum).
critical <- merge(
  critical,
  ppts.all %>% select(participant_id, dyslexia, adhd, asd, age, gender)
)
critical %>%
ggplot(aes(x = dyslexia, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))As do ppts with ADHD
critical %>%
ggplot(aes(x = adhd, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))And ASD
critical %>%
ggplot(aes(x = asd, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))Men and women perform similarly.
critical %>%
ggplot(aes(x = gender, y = accuracy, fill=excluded.attention)) +
stat_summary(fun="mean", geom="bar", position="dodge", alpha=0.8) +
stat_summary(fun.data=mean_cl_boot, geom="errorbar", position=position_dodge(0.9), width=0.2) +
scale_fill_manual(values=c("#009933", "#FF0000"))Ppts get better with age.
critical %>%
# group_by(participant_id, excluded.attention) %>%
# summarize(
# age = mean(age),
# accuracy = mean(accuracy),
# .groups="drop"
# ) %>%
ggplot(aes(x = age, y = accuracy, color=excluded.attention)) +
# geom_point() +
stat_summary_bin(fun.data = mean_cl_boot, geom="pointrange", binwidth = 0.05) +
scale_x_log10() +
geom_smooth(method="lm", formula="y~x", se=F) +
labs(y = "critical accuracy") +
scale_color_manual(values=c("#009933", "#FF0000"))## Warning: Removed 1 rows containing missing values (geom_segment).
Negative interaction of LO and ASD (b=-0.19, p=0.19)
# Full model plus an ASD main effect and its interaction with the GPT-3
# log-odds.
model.asd <- glmer(
  is_start_numeric ~ condition + knowledge_cue + log_odds +
    recent_mention + first_mention + asd + asd:log_odds +
    (1 + condition | item),
  data = df_merged,
  family = binomial(),
  control = glmerControl(optimizer = "bobyqa")
)
summary(model.asd)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## is_start_numeric ~ condition + knowledge_cue + log_odds + recent_mention +
## first_mention + asd + asd:log_odds + (1 + condition | item)
## Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 536.1 584.7 -257.0 514.1 602
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.3242 -0.4723 0.2283 0.3571 2.9976
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## item (Intercept) 0.1796 0.4238
## conditionTrue Belief 0.2790 0.5282 -0.97
## Number of obs: 613, groups: item, 12
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.40542 0.37051 6.492 8.46e-11 ***
## conditionTrue Belief -3.38406 0.36203 -9.348 < 2e-16 ***
## knowledge_cueImplicit -0.72545 0.31843 -2.278 0.0227 *
## log_odds 0.05554 0.04803 1.156 0.2475
## recent_mentionStart 0.33808 0.22765 1.485 0.1375
## first_mentionStart -0.04544 0.23564 -0.193 0.8471
## asdTrue 1.32252 0.67657 1.955 0.0506 .
## log_odds:asdTrue -0.18522 0.14063 -1.317 0.1878
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndtTB knwl_I lg_dds rcnt_S frst_S asdTru
## condtnTrBlf -0.779
## knwldg_cImp -0.634 0.503
## log_odds -0.488 0.525 0.659
## rcnt_mntnSt -0.258 -0.110 0.000 -0.015
## frst_mntnSt -0.143 -0.167 -0.209 -0.267 0.032
## asdTrue -0.031 -0.034 0.031 0.019 0.034 -0.074
## lg_dds:sdTr 0.011 -0.006 0.016 -0.143 -0.030 -0.018 0.192
Significant negative interaction of LO and Dyslexia (b=-0.31, p=0.02)
# Full model plus a dyslexia main effect and its interaction with the
# GPT-3 log-odds.
model.dyslexia <- glmer(
  is_start_numeric ~ condition + knowledge_cue + log_odds +
    recent_mention + first_mention + dyslexia + dyslexia:log_odds +
    (1 + condition | item),
  data = df_merged,
  family = binomial(),
  control = glmerControl(optimizer = "bobyqa")
)
summary(model.dyslexia)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## is_start_numeric ~ condition + knowledge_cue + log_odds + recent_mention +
## first_mention + dyslexia + dyslexia:log_odds + (1 + condition | item)
## Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 538.1 586.7 -258.1 516.1 602
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.5784 -0.4682 0.2263 0.3653 2.7943
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## item (Intercept) 0.1922 0.4384
## conditionTrue Belief 0.2408 0.4907 -0.96
## Number of obs: 613, groups: item, 12
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.45824 0.37879 6.490 8.6e-11 ***
## conditionTrue Belief -3.37323 0.36295 -9.294 < 2e-16 ***
## knowledge_cueImplicit -0.74579 0.32249 -2.313 0.0207 *
## log_odds 0.05664 0.04865 1.164 0.2443
## recent_mentionStart 0.30900 0.22676 1.363 0.1730
## first_mentionStart -0.04165 0.23547 -0.177 0.8596
## dyslexiaTrue 0.30779 0.65140 0.473 0.6366
## log_odds:dyslexiaTrue -0.30561 0.13464 -2.270 0.0232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndtTB knwl_I lg_dds rcnt_S frst_S dyslxT
## condtnTrBlf -0.788
## knwldg_cImp -0.638 0.522
## log_odds -0.490 0.541 0.669
## rcnt_mntnSt -0.254 -0.108 -0.001 -0.024
## frst_mntnSt -0.131 -0.180 -0.215 -0.277 0.041
## dyslexiaTru -0.150 0.108 0.114 0.084 0.012 -0.071
## lg_dds:dysT -0.054 0.043 0.011 -0.148 0.011 0.034 -0.119
NS negative interaction of LO and ADHD (b = -0.06, p = .54); marginal positive main effect of ADHD (b = 0.72, p = .07)
# ADHD model: does self-reported ADHD moderate the effect of the GPT-3
# log-odds predictor on choosing the "start" interpretation?
# Same fixed-effect structure as the dyslexia/ASD models, with the ADHD
# main effect and its interaction with log_odds.
# Random effects: by-item intercepts and condition slopes.
# bobyqa optimizer for consistency with the other glmer fits here.
model.adhd <- glmer(
  is_start_numeric ~ condition + knowledge_cue + log_odds +
    recent_mention + first_mention + adhd + adhd:log_odds +
    (1 + condition | item),
  data = df_merged,
  family = binomial(),
  control = glmerControl(optimizer = "bobyqa")
)
summary(model.adhd)## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## is_start_numeric ~ condition + knowledge_cue + log_odds + recent_mention +
## first_mention + adhd + adhd:log_odds + (1 + condition | item)
## Data: df_merged
## Control: glmerControl(optimizer = "bobyqa")
##
## AIC BIC logLik deviance df.resid
## 539.7 588.3 -258.8 517.7 602
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.2656 -0.4773 0.2295 0.3601 2.8388
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## item (Intercept) 0.1992 0.4463
## conditionTrue Belief 0.2906 0.5391 -0.99
## Number of obs: 613, groups: item, 12
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.376816 0.375413 6.331 2.43e-10 ***
## conditionTrue Belief -3.376603 0.366220 -9.220 < 2e-16 ***
## knowledge_cueImplicit -0.729177 0.316779 -2.302 0.0213 *
## log_odds 0.046575 0.048214 0.966 0.3340
## recent_mentionStart 0.314904 0.225923 1.394 0.1634
## first_mentionStart -0.003348 0.235015 -0.014 0.9886
## adhdTrue 0.715835 0.397925 1.799 0.0720 .
## log_odds:adhdTrue -0.064117 0.103996 -0.617 0.5375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) cndtTB knwl_I lg_dds rcnt_S frst_S adhdTr
## condtnTrBlf -0.783
## knwldg_cImp -0.634 0.505
## log_odds -0.483 0.531 0.653
## rcnt_mntnSt -0.252 -0.107 -0.002 -0.026
## frst_mntnSt -0.132 -0.183 -0.210 -0.267 0.034
## adhdTrue -0.097 -0.031 0.030 -0.012 0.002 0.042
## lg_dds:dhdT -0.024 0.010 0.025 -0.196 0.012 -0.009 0.105
# Item-level summary: collapse trials within each
# item x condition x cue x mention cell.
# p_start_cond maps the GPT-3 log-odds to a probability via the inverse
# logit, putting model and human responses on the same [0, 1] scale.
df_merged_summ = df_merged %>%
mutate(p_start_cond = 1/(1 + exp(-log_odds))) %>%
group_by(item_id, condition, knowledge_cue, recent_mention, first_mention) %>%
summarise(prop_start = mean(is_start),
lo = mean(log_odds),
accuracy = mean(is_correct),
p_start_gpt3 = mean(p_start_cond),
.groups="drop")df_merged_summ %>%
# Density plot comparing the distribution of P(Start) for humans vs GPT-3,
# stacked vertically by metric; dotted line marks indifference (0.5).
mutate("GPT-3\n(Proportion)" = p_start_gpt3,
"Human\n(Proportion)" = prop_start) %>%
pivot_longer(cols = c("Human\n(Proportion)", "GPT-3\n(Proportion)"),
names_to = "metric",
values_to = "value") %>%
ggplot(aes(x = value,
fill = condition)) +
geom_density(alpha = .5, color="#666666") +
theme_minimal() +
facet_wrap(. ~ metric,
# scales = "free",
ncol=1,
strip.position = "left") +
geom_vline(xintercept = .5, linetype = "dotted") +
theme(
legend.position = "bottom"
) +
scale_y_continuous(position="right") +
labs(
fill = "Knowledge State",
x = "P(Start)",
y = "Density"
) +
# Enlarged text for publication-sized figure output.
theme(axis.title = element_text(size=rel(2)),
axis.text = element_text(size = rel(2)),
legend.text = element_text(size = rel(2)),
legend.title = element_text(size = rel(2)),
strip.text.y = element_text(size = rel(2)))df_merged_summ %>%
# Scatter of human accuracy against GPT-3 evidence strength.
mutate(
# Sign-correct the log-odds so positive always means "evidence for the
# correct answer": raw log-odds already point the right way in the
# False Belief condition; otherwise flip the sign.
lo.correct = case_when(
condition == "False Belief" ~ lo,
T ~ -1 * lo,
)
) %>%
ggplot(aes(x = lo.correct, y = accuracy, color=condition, fill=condition)) +
# Small vertical jitter to separate overlapping accuracy values.
geom_point(position=position_jitter(height=0.01), alpha=0.75) +
geom_smooth(method="lm", formula="y~x", alpha=0.15) +
theme_minimal() +
labs(
y = "Human Accuracy",
x = "GPT-3 Log-odds Ratio (Correct - Incorrect)",
fill = "Knowledge State",
color = "Knowledge State"
) +
theme(
legend.position = "top"
)r2 <- c(
# Marginal R^2 for each nested model; [1] takes R2m (fixed effects only)
# from r.squaredGLMM's output. Order presumably matches the `model`
# labels defined below (Base ... Base + GPT-3 + Condition) — the model
# objects are fit earlier in the file, outside this chunk.
r.squaredGLMM(model_all_but_lo_and_condition)[1],
r.squaredGLMM(model_no_condition)[1],
r.squaredGLMM(model_all_but_lo)[1],
r.squaredGLMM(model_all_fe)[1])## Warning: 'r.squaredGLMM' now calculates a revised statistic. See the help page.
## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.
## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.
## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.
## Warning: the null model is correct only if all variables used by the original
## model remain unchanged.
# Human-readable labels for the four model specifications, in the same
# order as the r2 vector computed above.
model <- c("Base", "Base + GPT-3", "Base + Condition", "Base + GPT-3 + Condition")
# Pair each label with its marginal R^2 for plotting.
df.r2 <- data.frame(model, r2)
# Horizontal bar chart of marginal R^2 per model, ordered best-first.
df.r2 %>%
ggplot(aes(x = r2, y = reorder(model, -r2))) +
geom_bar(stat="identity", fill = "#69c8ff") +
theme_minimal() +
labs(
x = bquote("Marginal"~R^2~""),
y = "Predictors"
)tga <- read.csv("token_generation_accuracy.csv")  # per-model token generation accuracy
tga| Model | Accuracy |
|---|---|
| text-babbage-001 | 0.3177083 |
| text-ada-001 | 0.3437500 |
| text-curie-001 | 0.4166667 |
| babbage | 0.4375000 |
| ada | 0.4427083 |
| curie | 0.4583333 |
| davinci | 0.6406250 |
| text-davinci-002 | 0.6979167 |
## Density version
# NOTE(review): the "Density version" label above looks stale — this chunk
# draws a vertical bar chart of token-generation accuracy by model.
tga %>%
ggplot(aes(x = reorder(Model, Accuracy), y=Accuracy, fill=Model)) +
geom_bar(stat="identity") +
labs(x = "Model",
y = "Accuracy",
fill = "Knowledge State") +
theme_minimal() +
scale_fill_viridis_d() +
# Fill is purely decorative (one color per model), so hide the legend.
theme(
legend.position = "none"
) +
theme(axis.title = element_text(size=rel(1.5)),
axis.text.x = element_text(size = rel(1.5), angle=40, hjust=1),
axis.text.y = element_text(size = rel(1.5)),
legend.text = element_text(size = rel(1.5)),
legend.title = element_text(size = rel(1.5)),
strip.text.x = element_text(size = rel(1.5)))tga %>%
# Horizontal variant of the accuracy chart, saved to PDF below.
# NOTE(review): axes are flipped here (y = Model, x = Accuracy) but the
# labs() call still says x = "Model", y = "Accuracy" — labels look swapped.
ggplot(aes(y = reorder(Model, Accuracy), x=Accuracy, fill=Model)) +
geom_bar(stat="identity") +
labs(x = "Model",
y = "Accuracy",
fill = "Knowledge State") +
theme_minimal() +
scale_fill_viridis_d() +
theme(
legend.position = "none"
) +
coord_cartesian(xlim=c(0,1)) +
# Reference line at 0.83 — presumably the human accuracy benchmark;
# TODO confirm against the paper/earlier analysis.
geom_vline(xintercept=0.83, linetype="dashed", color="#ff0000") +
theme(axis.title = element_text(size=rel(1.5)),
axis.text.x = element_text(size = rel(1.7)),
axis.text.y = element_text(size = rel(1.7)),
legend.text = element_text(size = rel(1.5)),
legend.title = element_text(size = rel(1.5)),
strip.text.x = element_text(size = rel(1.5)))ggsave("../Figures/textgen-model-acc.pdf", dpi = 300, width=7, height=5)